## Netflix Release Year
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from api_keys import netflix_api_key
# Location of the raw IMDb CSV, then load it and preview the first rows
data_path = "imdb.csv"
data = pd.read_csv(filepath_or_buffer=data_path)
data.head()
| Unnamed: 0 | title | year | kind | genre | rating | vote | country | language | runtime | cast | director | composer | writer | runtimes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Dinosaur Planet | 2003.0 | tv mini series | ['Documentary', 'Animation', 'Family'] | 7.7 | 474.0 | ['United States'] | ['English'] | ['50'] | ['Christian Slater', 'Scott Sampson'] | NaN | NaN | ['Mike Carrol', 'Mike Carroll', 'Georgann Kane'] | NaN |
| 1 | 1 | Character | 2021.0 | movie | ['Crime', 'Horror', 'Thriller'] | 8.3 | 46.0 | ['Japan'] | ['Japanese'] | ['125'] | ['Masaki Suda', 'Fukase', 'Mitsuki Takahata', ... | ['Akira Nagai'] | ['Youki Kojima'] | ['Takashi Nagasaki', 'Takashi Nagasaki', 'Anna... | NaN |
| 2 | 2 | Get Up and Dance! | 1994.0 | video movie | ['Family'] | 8.1 | 18.0 | ['United States'] | ['English'] | ['54'] | ['Paula Abdul', 'Aurorah Allain', 'Bill Bohl',... | ['Steve Purcell'] | NaN | NaN | NaN |
| 3 | 3 | The Rise and Fall of El Chapo | 2016.0 | tv movie | ['Documentary'] | 6.9 | 42.0 | ['United States'] | NaN | ['85'] | NaN | NaN | NaN | NaN | NaN |
| 4 | 4 | Sick - IMDb | NaN | NaN | ['Thriller'] | NaN | NaN | ['United States'] | NaN | NaN | ['Marc Menchaca', 'Gideon Adlon', 'Dylan Spray... | ['John Hyams'] | NaN | ['Katelyn Crabb', 'Kevin Williamson'] | NaN |
# Keep only the columns used by the release-year analysis
year_columns = ['title', 'year', 'rating', 'vote']
year_data_df = data.loc[:, year_columns]
year_data_df.head()
| title | year | rating | vote | |
|---|---|---|---|---|
| 0 | Dinosaur Planet | 2003.0 | 7.7 | 474.0 |
| 1 | Character | 2021.0 | 8.3 | 46.0 |
| 2 | Get Up and Dance! | 1994.0 | 8.1 | 18.0 |
| 3 | The Rise and Fall of El Chapo | 2016.0 | 6.9 | 42.0 |
| 4 | Sick - IMDb | NaN | NaN | NaN |
# Discard every row that is missing any of the selected values
year_data_df = year_data_df.dropna(axis=0, how="any")
year_data_df.head(20)
| title | year | rating | vote | |
|---|---|---|---|---|
| 0 | Dinosaur Planet | 2003.0 | 7.7 | 474.0 |
| 1 | Character | 2021.0 | 8.3 | 46.0 |
| 2 | Get Up and Dance! | 1994.0 | 8.1 | 18.0 |
| 3 | The Rise and Fall of El Chapo | 2016.0 | 6.9 | 42.0 |
| 5 | 8 Man | 1992.0 | 5.5 | 93.0 |
| 6 | What the #$*! Do We (K)now!? | 2004.0 | 5.3 | 13432.0 |
| 7 | Class of Nuke 'Em High Part II: Subhumanoid Me... | 1991.0 | 4.5 | 2177.0 |
| 8 | The Fighter | 2010.0 | 7.8 | 351199.0 |
| 11 | Neil Diamond: Greatest Hits Live | 1988.0 | 8.0 | 81.0 |
| 12 | 7 Seconds | 2005.0 | 4.8 | 7153.0 |
| 13 | By Dawn's Early Light | 1990.0 | 7.1 | 2898.0 |
| 14 | Seeta Aur Geeta | 1972.0 | 6.8 | 1818.0 |
| 15 | Strange Relations | 2001.0 | 7.6 | 768.0 |
| 16 | Sesame Street Presents: The Street We Live On | 2004.0 | 7.2 | 67.0 |
| 17 | Lilo and Stitch | 2012.0 | 7.2 | 22.0 |
| 18 | Boycott | 2001.0 | 7.2 | 901.0 |
| 19 | Meat Loaf: Bat Out of Hell | 1999.0 | 7.7 | 286.0 |
| 20 | Aqua Teen Hunger Force | 2000.0 | 7.6 | 23785.0 |
| 21 | FernGully 2: The Magical Rescue | 1998.0 | 4.6 | 1484.0 |
| 22 | Lady Chatterley | 1993.0 | 6.9 | 1605.0 |
# Keep only the first row seen for each title
year_data_df = year_data_df.drop_duplicates(subset="title", keep="first")
# Earliest release year left in the data (displayed as the cell result)
earliestyear = year_data_df["year"].min()
earliestyear
1914.0
# Decade buckets for the release year. The upper edges sit just below each
# decade boundary (e.g. 1999.9) so a year such as 2000 lands in "2000-2010".
bin_names = [
    "Before 1940", "1940-1950", "1950-1960", "1960-1970", "1970-1980",
    "1980-1990", "1990-2000", "2000-2010", "2010-2020", "Post 2020",
]
bins = [0, 1939.9, 1949.9, 1959.9, 1969.9, 1979.9, 1989.9, 1999.9, 2009.9, 2019.9, 3000]
# Label every title with the bucket its release year falls into
year_data_df["Year Group"] = pd.cut(
    x=year_data_df["year"],
    bins=bins,
    labels=bin_names,
    include_lowest=True,
)
year_data_df.head(20)
| title | year | rating | vote | Year Group | |
|---|---|---|---|---|---|
| 0 | Dinosaur Planet | 2003.0 | 7.7 | 474.0 | 2000-2010 |
| 1 | Character | 2021.0 | 8.3 | 46.0 | Post 2020 |
| 2 | Get Up and Dance! | 1994.0 | 8.1 | 18.0 | 1990-2000 |
| 3 | The Rise and Fall of El Chapo | 2016.0 | 6.9 | 42.0 | 2010-2020 |
| 5 | 8 Man | 1992.0 | 5.5 | 93.0 | 1990-2000 |
| 6 | What the #$*! Do We (K)now!? | 2004.0 | 5.3 | 13432.0 | 2000-2010 |
| 7 | Class of Nuke 'Em High Part II: Subhumanoid Me... | 1991.0 | 4.5 | 2177.0 | 1990-2000 |
| 8 | The Fighter | 2010.0 | 7.8 | 351199.0 | 2010-2020 |
| 11 | Neil Diamond: Greatest Hits Live | 1988.0 | 8.0 | 81.0 | 1980-1990 |
| 12 | 7 Seconds | 2005.0 | 4.8 | 7153.0 | 2000-2010 |
| 13 | By Dawn's Early Light | 1990.0 | 7.1 | 2898.0 | 1990-2000 |
| 14 | Seeta Aur Geeta | 1972.0 | 6.8 | 1818.0 | 1970-1980 |
| 15 | Strange Relations | 2001.0 | 7.6 | 768.0 | 2000-2010 |
| 16 | Sesame Street Presents: The Street We Live On | 2004.0 | 7.2 | 67.0 | 2000-2010 |
| 17 | Lilo and Stitch | 2012.0 | 7.2 | 22.0 | 2010-2020 |
| 18 | Boycott | 2001.0 | 7.2 | 901.0 | 2000-2010 |
| 19 | Meat Loaf: Bat Out of Hell | 1999.0 | 7.7 | 286.0 | 1990-2000 |
| 20 | Aqua Teen Hunger Force | 2000.0 | 7.6 | 23785.0 | 2000-2010 |
| 21 | FernGully 2: The Magical Rescue | 1998.0 | 4.6 | 1484.0 | 1990-2000 |
| 22 | Lady Chatterley | 1993.0 | 6.9 | 1605.0 | 1990-2000 |
# GRAPH FOR DISTRIBUTION OF YEARS
# Count the titles that fall into each "Year Group" bucket
yeargroup_df = year_data_df.groupby(['Year Group'])
ygcount_df = pd.DataFrame(yeargroup_df['year'].count())

# Bar chart of the per-bucket counts
yeargroup_bar = ygcount_df.plot(kind="bar", title="Distribution of Movies by Year", color="crimson", legend=False)

# Axis labels
yeargroup_bar.set_xlabel("Years")
yeargroup_bar.set_ylabel("Number of Movie")

# Both functions have to be *called*; the bare names only evaluate to the
# function objects, so the layout was never tightened and show() never ran.
plt.tight_layout()
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Order the titles from highest to lowest rating and preview the top rows
topmovies_df = year_data_df.sort_values(by="rating", ascending=False)
topmovies_df.head()
| title | year | rating | vote | Year Group | |
|---|---|---|---|---|---|
| 3848 | Dragon Family | 2004.0 | 9.6 | 8.0 | 2000-2010 |
| 2021 | Mortal Kombat: At The Movies | 2012.0 | 9.5 | 6.0 | 2010-2020 |
| 7655 | Band of Brothers | 2001.0 | 9.4 | 398551.0 | 2000-2010 |
| 2906 | The Sopranos: Season 6 Invitation to the Set | 2005.0 | 9.4 | 47.0 | 2000-2010 |
| 1185 | Widespread Panic: The Earth Will Swallow You | 2002.0 | 9.3 | 13.0 | 2000-2010 |
# GRAPH FOR AVERAGE VOTES PER MOVIE BY YEAR
# Mean vote count of the titles in each "Year Group" bucket
topyear_df = topmovies_df.groupby(['Year Group'])
tycount_df = pd.DataFrame(topyear_df['vote'].mean())

# Bar chart of the per-bucket averages
top25_bar = tycount_df.plot(kind="bar", title="Average Votes per Movie by Year", color="crimson", legend=False)

# Axis labels
top25_bar.set_xlabel("Years")
top25_bar.set_ylabel("Average Votes per Movie")

# Call the functions; without parentheses they are only referenced, never run
plt.tight_layout()
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# First 25 rows of the rating-sorted frame, i.e. the 25 highest-rated titles
top25_df = topmovies_df.iloc[:25]
top25_df
| title | year | rating | vote | Year Group | |
|---|---|---|---|---|---|
| 3848 | Dragon Family | 2004.0 | 9.6 | 8.0 | 2000-2010 |
| 2021 | Mortal Kombat: At The Movies | 2012.0 | 9.5 | 6.0 | 2010-2020 |
| 7655 | Band of Brothers | 2001.0 | 9.4 | 398551.0 | 2000-2010 |
| 2906 | The Sopranos: Season 6 Invitation to the Set | 2005.0 | 9.4 | 47.0 | 2000-2010 |
| 1185 | Widespread Panic: The Earth Will Swallow You | 2002.0 | 9.3 | 13.0 | 2000-2010 |
| 698 | Yanni: Live at the Acropolis | 1994.0 | 9.3 | 410.0 | 1990-2000 |
| 652 | The Shawshank Redemption | 1994.0 | 9.3 | 2461873.0 | 1990-2000 |
| 2807 | Queensrÿche: Operation Livecrime | 1991.0 | 9.3 | 146.0 | 1990-2000 |
| 282 | Blue Planet II | 2017.0 | 9.3 | 36474.0 | 2010-2020 |
| 2600 | Pride 25: Body Blow | 2003.0 | 9.3 | 8.0 | 2000-2010 |
| 3360 | 311: Live in Concert, New Orleans - 3-11 Day 2004 | 2004.0 | 9.3 | 113.0 | 2000-2010 |
| 1224 | Smallville Season 3 Promo | 2003.0 | 9.2 | 58.0 | 2000-2010 |
| 9183 | Depeche Mode: Devotional | 1993.0 | 9.2 | 914.0 | 1990-2000 |
| 7475 | The World at War | 1973.0 | 9.2 | 23729.0 | 1970-1980 |
| 4662 | Behind the Scenes: One Tree Hill Season 6 | 2009.0 | 9.2 | 43.0 | 2000-2010 |
| 1657 | The Godfather | 1972.0 | 9.2 | 1702698.0 | 1970-1980 |
| 7065 | Roy Orbison: Black and White Night 30 | 2017.0 | 9.2 | 52.0 | 2010-2020 |
| 753 | Dream Theater: Live at Budokan | 2004.0 | 9.2 | 713.0 | 2000-2010 |
| 6047 | The Lex Series Laptop | 2019.0 | 9.2 | 8.0 | 2010-2020 |
| 5317 | Baseball | 1994.0 | 9.2 | 4028.0 | 1990-2000 |
| 7187 | Nine Inch Nails Live: And All That Could Have ... | 2002.0 | 9.2 | 1251.0 | 2000-2010 |
| 4812 | Game of Thrones | 2011.0 | 9.2 | 1873254.0 | 2010-2020 |
| 3812 | Pizza Delivery/Home Sweet Pineapple | 1999.0 | 9.2 | 1441.0 | 1990-2000 |
| 6558 | Selena Live: The Last Concert | 1995.0 | 9.2 | 175.0 | 1990-2000 |
| 2277 | Carly Simon Live from Martha's Vineyard | 1987.0 | 9.1 | 17.0 | 1980-1990 |
## Netflix Kind Values
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
# Load the IMDb export and keep the first row for each title
netflix_pd = pd.read_csv("imdb.csv").drop_duplicates(subset="title", keep="first")

# Reduce to the columns needed here and drop rows missing any of them
kind_columns = ["kind", "rating", "vote"]
netflix_reduced = netflix_pd.loc[:, kind_columns]
netflix_clean = netflix_reduced.dropna()

# Distinct kinds and how many titles each one has
kind_series = netflix_clean["kind"]
kind_unique = kind_series.unique()
kind_values = kind_series.value_counts()
# Pie chart of how the cleaned titles are distributed across "kind".
# Labels and slice sizes are taken from kind_values (computed from the data)
# rather than typed in by hand, so they cannot drift out of sync with it.
labels = kind_values.index.tolist()
sizes = kind_values.tolist()
# Same small offset for every slice, one entry per slice
seperate = (0.01,) * len(sizes)

# plot the pie chart
plt.figure(figsize=(10,10))
plt.pie(sizes, explode=seperate, autopct="%1.1f%%", labels=labels)
plt.legend(loc="upper right")
plt.axis("equal")
plt.title("Netflix Kind Distribution")
plt.show()
# Ratings of the "movie" and "video movie" kinds, and the mean of each
is_movie = netflix_clean["kind"].eq("movie")
is_video_movie = netflix_clean["kind"].eq("video movie")
movie_rating = netflix_clean.loc[is_movie, "rating"]
video_rating = netflix_clean.loc[is_video_movie, "rating"]
movie_rating_mean, video_rating_mean = movie_rating.mean(), video_rating.mean()

# Bar labels and heights, in matching order
labels = ["Digital Movies", "Video Movies"]
ratings = [movie_rating_mean, video_rating_mean]

# Two-bar comparison on the full 0-10 rating scale
plt.bar(labels, ratings, align="center", alpha=0.5, color="red")
plt.ylim(0, 10)
plt.xlabel("Movie Type")
plt.ylabel("IMDB Ratings")
plt.title("Digital Movie vs Video Movie Ratings")
plt.show()
## Netflix Genre Types
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
# Parse the CSV once; the other two names get independent copies instead of
# re-reading and re-parsing the same file from disk.
data_path = "Netflix_Data.csv"
netflix_pd = pd.read_csv(data_path)
data = netflix_pd.copy()
df = netflix_pd.copy()
df.head()
| Title | Genre | Tags | Languages | Series or Movie | Hidden Gem Score | Country Availability | Runtime | Director | Writer | ... | Netflix Release Date | Production House | Netflix Link | IMDb Link | Summary | IMDb Votes | Image | Poster | TMDb Trailer | Trailer Site | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lets Fight Ghost | Crime, Drama, Fantasy, Horror, Romance | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | ... | 2021-03-04 | Canal+, Sandrew Metronome | https://www.netflix.com/watch/81415947 | https://www.imdb.com/title/tt1139797 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | https://m.media-amazon.com/images/M/MV5BOWM4NT... | NaN | NaN |
| 1 | HOW TO BUILD A GIRL | Comedy | Dramas,Comedies,Films Based on Books,British | English | Movie | 7.0 | Canada | 1-2 hour | Coky Giedroyc | Caitlin Moran | ... | 2021-03-04 | Film 4, Monumental Pictures, Lionsgate | https://www.netflix.com/watch/81041267 | https://www.imdb.com/title/tt4193072 | When nerdy Johanna moves to London, things get... | 2838.0 | https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... | https://m.media-amazon.com/images/M/MV5BZGUyN2... | https://www.youtube.com/watch?v=eIbcxPy4okQ | YouTube |
| 2 | Centigrade | Drama, Thriller | Thrillers | English | Movie | 6.4 | Canada | 1-2 hour | Brendan Walsh | Brendan Walsh, Daley Nixon | ... | 2021-03-04 | NaN | https://www.netflix.com/watch/81305978 | https://www.imdb.com/title/tt8945942 | Trapped in a frozen car during a blizzard, a p... | 1720.0 | https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... | https://m.media-amazon.com/images/M/MV5BODM2MD... | https://www.youtube.com/watch?v=0RvV7TNUlkQ | YouTube |
| 3 | ANNE+ | Drama | TV Dramas,Romantic TV Dramas,Dutch TV Shows | Turkish | Series | 7.7 | Belgium,Netherlands | < 30 minutes | NaN | NaN | ... | 2021-03-04 | NaN | https://www.netflix.com/watch/81336456 | https://www.imdb.com/title/tt6132758 | Upon moving into a new place, a 20-something r... | 1147.0 | https://occ-0-1489-1490.1.nflxso.net/dnm/api/v... | https://m.media-amazon.com/images/M/MV5BNWRkMz... | NaN | NaN |
| 4 | Moxie | Animation, Short, Drama | Social Issue Dramas,Teen Movies,Dramas,Comedie... | English | Movie | 8.1 | Lithuania,Poland,France,Iceland,Italy,Spain,Gr... | 1-2 hour | Stephen Irwin | NaN | ... | 2021-03-04 | NaN | https://www.netflix.com/watch/81078393 | https://www.imdb.com/title/tt2023611 | Inspired by her moms rebellious past and a con... | 63.0 | https://occ-0-4039-1500.1.nflxso.net/dnm/api/v... | https://m.media-amazon.com/images/M/MV5BODYyNW... | NaN | NaN |
5 rows × 29 columns
# Number of missing values in each column
df.isna().sum()
Title 0 Genre 1710 Tags 67 Languages 1935 Series or Movie 0 Hidden Gem Score 2101 Country Availability 19 Runtime 1 Director 4708 Writer 4330 Actors 1925 View Rating 7024 IMDb Score 2099 Rotten Tomatoes Score 9098 Metacritic Score 11144 Awards Received 9405 Awards Nominated For 7819 Boxoffice 11473 Release Date 2107 Netflix Release Date 0 Production House 10331 Netflix Link 0 IMDb Link 2303 Summary 9 IMDb Votes 2101 Image 0 Poster 3638 TMDb Trailer 8286 Trailer Site 8286 dtype: int64
# Columns that are not used in the rest of this section
unused_columns = [
    'Metacritic Score', 'Boxoffice', 'Production House', 'Netflix Link', 'IMDb Link',
    'Poster', 'TMDb Trailer', 'Trailer Site',
]
df = df.drop(columns=unused_columns)

# Parse both date columns, then keep the year of each as its own column
for date_column in ('Release Date', 'Netflix Release Date'):
    df[date_column] = pd.to_datetime(df[date_column])
df['Released_Year'] = df['Release Date'].dt.year
df['Released_Year_Net'] = df['Netflix Release Date'].dt.year
# Bar colours: first bar highlighted in crimson, second in black
colors = ['crimson', 'black']
# Number of titles per "Series or Movie" category
count = df['Series or Movie'].value_counts()

# The x values must be the categories the counts belong to (the value_counts
# index). Passing the raw column instead pairs the counts with whatever the
# first rows of the frame happen to contain, which can mislabel the bars.
fig = go.Figure(data=[go.Bar(
    x=count.index,
    y=count,
    text=count,
    textposition='auto',
    marker_color=colors  # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Movie or Tv Series ?',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  barmode='group', xaxis_tickangle=-45,
                  yaxis=dict(
                      title='Quantity',
                      titlefont_size=14),
                  xaxis=dict(
                      title='Category',
                      titlefont_size=14))
# Rows whose category is exactly 'Movie'
is_movie_row = df['Series or Movie'] == 'Movie'
df_movie = df[is_movie_row]
df_movie.head(n=1)
| Title | Genre | Tags | Languages | Series or Movie | Hidden Gem Score | Country Availability | Runtime | Director | Writer | ... | Rotten Tomatoes Score | Awards Received | Awards Nominated For | Release Date | Netflix Release Date | Summary | IMDb Votes | Image | Released_Year | Released_Year_Net | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | HOW TO BUILD A GIRL | Comedy | Dramas,Comedies,Films Based on Books,British | English | Movie | 7.0 | Canada | 1-2 hour | Coky Giedroyc | Caitlin Moran | ... | 79.0 | 1.0 | NaN | 2020-05-08 | 2021-03-04 | When nerdy Johanna moves to London, things get... | 2838.0 | https://occ-0-1081-999.1.nflxso.net/dnm/api/v6... | 2020.0 | 2021 |
1 rows × 23 columns
# Rows whose category is exactly 'Series'
is_series_row = df["Series or Movie"] == "Series"
df_series = df[is_series_row]
df_series.head(n=1)
| Title | Genre | Tags | Languages | Series or Movie | Hidden Gem Score | Country Availability | Runtime | Director | Writer | ... | Rotten Tomatoes Score | Awards Received | Awards Nominated For | Release Date | Netflix Release Date | Summary | IMDb Votes | Image | Released_Year | Released_Year_Net | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Lets Fight Ghost | Crime, Drama, Fantasy, Horror, Romance | Comedy Programmes,Romantic TV Comedies,Horror ... | Swedish, Spanish | Series | 4.3 | Thailand | < 30 minutes | Tomas Alfredson | John Ajvide Lindqvist | ... | 98.0 | 74.0 | 57.0 | 2008-12-12 | 2021-03-04 | A med student with a supernatural gift tries t... | 205926.0 | https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/... | 2008.0 | 2021 |
1 rows × 23 columns
# Series that have a Genre value
df_series_gen = df_series.dropna(subset=['Genre'])
colors_10 = ['DarkRed', 'FireBrick','Red', 'Crimson', 'IndianRed', 'slategray', 'gray', 'dimgrey', 'DarkSlateGrey', 'black']

# Split each comma-separated Genre string and strip the whitespace around every
# name. Splitting on ',' alone leaves a leading space on all but the first
# genre, so ' Drama' and 'Drama' would be counted as two different genres.
series_gen_list = df_series_gen['Genre'].str.split(',').apply(
    lambda names: [name.strip() for name in names]
)

# One entry per (title, genre) pair -> occurrences per genre, most frequent first
s_gen_counts = series_gen_list.explode().value_counts()
s_gen_list = s_gen_counts.to_dict()  # genre -> number of series listing it

# Single-column frame of the counts (column label passed as a plain name, not a set)
s_gen_df = s_gen_counts.to_frame(name='Counts of Genres in Tv Series')
top_10_s_gen = s_gen_df[0:10]

fig = go.Figure(data=[go.Bar(
    x=top_10_s_gen.index,
    y=top_10_s_gen['Counts of Genres in Tv Series'],
    text=top_10_s_gen['Counts of Genres in Tv Series'],
    textposition='auto',
    marker_color=colors_10  # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Most Popular in TV Genre',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  yaxis=dict(
                      title='Quantity',
                      titlefont_size=14),
                  xaxis=dict(
                      title='Genres',
                      titlefont_size=14))
# Movies that have a Genre value
df_movie_gen = df_movie.dropna(subset=['Genre'])

# Split each comma-separated Genre string and strip the whitespace around every
# name, so the result does not depend on the exact spacing after the commas.
movie_gen_list = df_movie_gen['Genre'].str.split(',').apply(
    lambda names: [name.strip() for name in names]
)

# One entry per (title, genre) pair -> occurrences per genre, most frequent first
m_gen_counts = movie_gen_list.explode().value_counts()
m_gen_list = m_gen_counts.to_dict()  # genre -> number of movies listing it

# Single-column frame of the counts (column label passed as a plain name, not a set)
m_gen_df = m_gen_counts.to_frame(name='Counts of Genres in Movies')
top_10_m_gen = m_gen_df[0:10]

fig = go.Figure(data=[go.Bar(
    x=top_10_m_gen.index,
    y=top_10_m_gen['Counts of Genres in Movies'],
    text=top_10_m_gen['Counts of Genres in Movies'],
    textposition='auto',
    marker_color=colors_10  # marker color can be a single color value or an iterable
)])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(title_text= 'Most Popular Movie Genre',
                  uniformtext_minsize=8, uniformtext_mode='hide',
                  yaxis=dict(
                      title='Quantity',
                      titlefont_size=14),
                  xaxis=dict(
                      title='Genres',
                      titlefont_size=14))
# Series that have an IMDb Score, ordered from highest to lowest score
df_series_imdb = (
    df_series
    .dropna(subset=['IMDb Score'])
    .sort_values(by='IMDb Score', ascending=False)
)
# First ten rows of that ordering
top_s_imdb_10_list = df_series_imdb.iloc[:10]

# One bar per title, labelled with its score
series_scores = top_s_imdb_10_list['IMDb Score']
series_score_bar = go.Bar(
    x=top_s_imdb_10_list['Title'],
    y=series_scores,
    text=series_scores,
    textposition='auto',
    marker_color=colors_10,
)
fig = go.Figure(data=[series_score_bar])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    title_text='Top Rated Tv Series Rated by IMDB Score?',
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    yaxis=dict(title='IMDb Score', titlefont_size=14),
    xaxis=dict(title='Titles', titlefont_size=14),
)
# Movies that have an IMDb Score, ordered from highest to lowest score
df_movie_imdb = (
    df_movie
    .dropna(subset=['IMDb Score'])
    .sort_values(by='IMDb Score', ascending=False)
)
# First ten rows of that ordering
top_m_imdb_10_list = df_movie_imdb.iloc[:10]

# One bar per title, labelled with its score
movie_scores = top_m_imdb_10_list['IMDb Score']
movie_score_bar = go.Bar(
    x=top_m_imdb_10_list['Title'],
    y=movie_scores,
    text=movie_scores,
    textposition='auto',
    marker_color=colors_10,
)
fig = go.Figure(data=[movie_score_bar])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(
    title_text='Top Rated Movies Rated by IMDB Rating',
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    yaxis=dict(title='IMDb Score', titlefont_size=14),
    xaxis=dict(title='Titles', titlefont_size=14),
)
## Country of Origin
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AffinityPropagation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import plotly as py
import plotly.graph_objs as go
import os
py.offline.init_notebook_mode(connected = True)
#print(os.listdir("../input"))
import datetime as dt
import missingno as msno
plt.rcParams['figure.dpi'] = 140
# Load the Netflix titles catalogue and preview the first three rows
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head(n=3)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | TV Show | 3% | NaN | João Miguel, Bianca Comparato, Michel Gomes, R... | Brazil | August 14, 2020 | 2020 | TV-MA | 4 Seasons | International TV Shows, TV Dramas, TV Sci-Fi &... | In a future where the elite inhabit an island ... |
| 1 | s2 | Movie | 7:19 | Jorge Michel Grau | Demián Bichir, Héctor Bonilla, Oscar Serrano, ... | Mexico | December 23, 2016 | 2016 | TV-MA | 93 min | Dramas, International Movies | After a devastating earthquake hits Mexico Cit... |
| 2 | s3 | Movie | 23:59 | Gilbert Chan | Tedd Chan, Stella Chung, Henley Hii, Lawrence ... | Singapore | December 20, 2018 | 2011 | R | 78 min | Horror Movies, International Movies | When an army recruit is found dead, his fellow... |
# Print the percentage of missing values for each column that has any
row_total = len(df)
for column in df.columns:
    missing_pct = df[column].isna().sum() / row_total * 100
    if missing_pct <= 0:
        continue  # nothing missing in this column
    print(f"{column} null rate: {round(missing_pct, 2)}%")
director null rate: 30.68% cast null rate: 9.22% country null rate: 6.51% date_added null rate: 0.13% rating null rate: 0.09%
# Replacements
# Missing countries take the most frequent country in the column
df['country'] = df['country'].fillna(df['country'].mode()[0])
# Assign the filled columns back to the frame. Calling
# df[col].replace(..., inplace=True) acts on an intermediate object and is not
# guaranteed to update df (it does not under pandas Copy-on-Write).
for column in ('cast', 'director'):
    df[column] = df[column].fillna('No Data')

# Drop the rows that still contain a missing value
df.dropna(inplace=True)

# Drop exact duplicate rows
df.drop_duplicates(inplace=True)

# Confirm no missing values remain
df.isnull().sum()
show_id 0 type 0 title 0 director 0 cast 0 country 0 date_added 0 release_year 0 rating 0 duration 0 listed_in 0 description 0 dtype: int64
# Print the frame summary (index range, per-column non-null counts and dtypes)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 7770 entries, 0 to 7786 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 7770 non-null object 1 type 7770 non-null object 2 title 7770 non-null object 3 director 7770 non-null object 4 cast 7770 non-null object 5 country 7770 non-null object 6 date_added 7770 non-null object 7 release_year 7770 non-null int64 8 rating 7770 non-null object 9 duration 7770 non-null object 10 listed_in 7770 non-null object 11 description 7770 non-null object dtypes: int64(1), object(11) memory usage: 789.1+ KB
# Parse the date each title was added, then break it into month / year parts
added_on = pd.to_datetime(df['date_added'])
df["date_added"] = added_on
df['month_added'] = added_on.dt.month
df['month_name_added'] = added_on.dt.month_name()
df['year_added'] = added_on.dt.year
df.head(n=3)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | month_added | month_name_added | year_added | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | TV Show | 3% | No Data | João Miguel, Bianca Comparato, Michel Gomes, R... | Brazil | 2020-08-14 | 2020 | TV-MA | 4 Seasons | International TV Shows, TV Dramas, TV Sci-Fi &... | In a future where the elite inhabit an island ... | 8 | August | 2020 |
| 1 | s2 | Movie | 7:19 | Jorge Michel Grau | Demián Bichir, Héctor Bonilla, Oscar Serrano, ... | Mexico | 2016-12-23 | 2016 | TV-MA | 93 min | Dramas, International Movies | After a devastating earthquake hits Mexico Cit... | 12 | December | 2016 |
| 2 | s3 | Movie | 23:59 | Gilbert Chan | Tedd Chan, Stella Chung, Henley Hii, Lawrence ... | Singapore | 2018-12-20 | 2011 | R | 78 min | Horror Movies, International Movies | When an army recruit is found dead, his fellow... | 12 | December | 2018 |
# Constant 1 per row so that group sums give row counts in later plots
df['count'] = 1

# A title can list several comma-separated countries; keep only the first one
# mentioned so each title is counted once.
df['first_country'] = df['country'].map(lambda listed: listed.split(",", 1)[0])
df['first_country'].head()
# Rating ages from this notebook: https://www.kaggle.com/andreshg/eda-beginner-to-expert-plotly (thank you!)
# Ratings grouped by the audience they map to, then flattened to rating -> audience
_ratings_by_audience = {
    'Kids': ('TV-Y', 'TV-G', 'G'),
    'Older Kids': ('TV-PG', 'TV-Y7-FV', 'TV-Y7', 'PG'),
    'Teens': ('TV-14', 'PG-13'),
    'Adults': ('TV-MA', 'R', 'NR', 'UR', 'NC-17'),
}
ratings_ages = {
    rating: audience
    for audience, rating_codes in _ratings_by_audience.items()
    for rating in rating_codes
}
# Map each rating code to its target audience
df['target_ages'] = df['rating'].replace(ratings_ages)
df['target_ages'].unique()

# Genre: normalise the spacing around commas, then split into a list of names
df['genre'] = df['listed_in'].apply(lambda x: x.replace(' ,', ',').replace(', ', ',').split(','))

# Reducing name length. The result is assigned back to the column; calling
# replace(..., inplace=True) on df['first_country'] acts on an intermediate
# object and is not guaranteed to update df (it does not under Copy-on-Write).
df['first_country'] = df['first_country'].replace({
    'United States': 'USA',
    'United Kingdom': 'UK',
    'South Korea': 'S. Korea',
})

# Title count per first-listed country, ten largest
data = df.groupby('first_country')['count'].sum().sort_values(ascending=False)[:10]
# Plot
# Ten bar colours: the first three highlighted, the remaining seven neutral
color_map = ['#b20710'] * 3 + ['#f5f5f1'] * 7
import matplotlib.lines as lines

# Bar chart of the title count for each country in `data`
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
ax.bar(data.index, data, width=0.5, color=color_map,
       edgecolor='darkgray', linewidth=0.6)

# Write each count a fixed 150 units above its bar
for country in data.index:
    bar_height = data[country]
    ax.annotate(f"{bar_height}",
                xy=(country, bar_height + 150),
                va='center', ha='center', fontweight='light', fontfamily='serif')

# Hide every border except the bottom one
for side in ('top', 'left', 'right'):
    ax.spines[side].set_visible(False)

# Country names as tick labels
ax.set_xticklabels(data.index, fontfamily='serif', rotation=0)

# Title and sub-title
fig.text(0.09, 1, 'Top 10 countries on Netflix', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.09, 0.95, 'The three most frequent countries have been highlighted.', fontsize=12, fontweight='light', fontfamily='serif')

# Commentary panel placed to the right of the axes
insight_text = '''
The most prolific producers of
content for Netflix are, primarily,
the USA, with India and the UK
a significant distance behind.
It makes sense that the USA produces
the most content as, afterall,
Netflix is a US company.
'''
fig.text(1.1, 1.01, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(1.1, 0.67, insight_text, fontsize=12, fontweight='light', fontfamily='serif')

# Horizontal grid lines every 500 from 0 up to (not including) 4000, drawn behind the bars
ax.grid(axis='y', linestyle='-', alpha=0.4)
grid_y_ticks = np.arange(0, 4000, 500)
ax.set_yticks(grid_y_ticks)
ax.set_axisbelow(True)

# Heavier baseline at y = 0
plt.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
ax.tick_params(axis='both', which='major', labelsize=12)

# Thin vertical rule in figure coordinates, between the chart and the commentary
l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig, color='black', lw=0.2)
fig.lines.extend([l1])

# No tick marks on either axis
ax.tick_params(axis='both', which='both', length=0)
plt.show()
import matplotlib.lines as lines

# Most frequent first-listed countries.
# NOTE(review): this keeps 11 countries although the chart title says "Top 10" — confirm intent.
country_order = df['first_country'].value_counts().iloc[:11].index

# Movie / TV Show counts per country, restricted to those countries
type_counts = df[['type', 'first_country']].groupby('first_country')['type'].value_counts()
data_q2q3 = type_counts.unstack().loc[country_order]
data_q2q3['sum'] = data_q2q3.sum(axis=1)

# Share of each type per country, ordered by ascending Movie share
type_shares = (data_q2q3.T / data_q2q3['sum']).T[['Movie', 'TV Show']]
data_q2q3_ratio = type_shares.sort_values(by='Movie', ascending=False).iloc[::-1]
movie_share = data_q2q3_ratio['Movie']
tv_share = data_q2q3_ratio['TV Show']

# Horizontal stacked bars: Movie share first, TV Show share stacked after it
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
ax.barh(data_q2q3_ratio.index, movie_share,
        color='#b20710', alpha=0.8, label='Movie')
ax.barh(data_q2q3_ratio.index, tv_share, left=movie_share,
        color='#221f1f', alpha=0.8, label='TV Show')
ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticklabels(data_q2q3_ratio.index, fontfamily='serif', fontsize=11)

# Movie percentage, centred in the Movie segment
for country in data_q2q3_ratio.index:
    ax.annotate(f"{movie_share[country]*100:.3}%",
                xy=(movie_share[country]/2, country),
                va='center', ha='center', fontsize=12, fontweight='light', fontfamily='serif',
                color='white')

# TV Show percentage, centred in the TV Show segment
for country in data_q2q3_ratio.index:
    ax.annotate(f"{tv_share[country]*100:.3}%",
                xy=(movie_share[country]+tv_share[country]/2, country),
                va='center', ha='center', fontsize=12, fontweight='light', fontfamily='serif',
                color='white')

# Title and sub-title
fig.text(0.13, 0.93, 'Top 10 countries Movie & TV Show split', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.131, 0.89, 'Percent Stacked Bar Chart', fontsize=12,fontfamily='serif')

# Hide all four borders
for side in ('top', 'left', 'right', 'bottom'):
    ax.spines[side].set_visible(False)

# Hand-placed legend, coloured to match the bar segments
fig.text(0.75,0.9,"Movie", fontweight="bold", fontfamily='serif', fontsize=15, color='#b20710')
fig.text(0.81,0.9,"|", fontweight="bold", fontfamily='serif', fontsize=15, color='black')
fig.text(0.82,0.9,"TV Show", fontweight="bold", fontfamily='serif', fontsize=15, color='#221f1f')

# Commentary panel placed to the right of the axes
split_insight_text = '''
Interestingly, Netflix in India
is made up nearly entirely of Movies.
Bollywood is big business, and perhaps
the main focus of this industry is Movies
and not TV Shows.
South Korean Netflix on the other hand is
almost entirely TV Shows.
The underlying resons for the difference
in content must be due to market research
conducted by Netflix.
'''
fig.text(1.1, 0.93, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(1.1, 0.44, split_insight_text, fontsize=12, fontweight='light', fontfamily='serif')

# Thin vertical rule in figure coordinates, between the chart and the commentary
l1 = lines.Line2D([1, 1], [0, 1], transform=fig.transFigure, figure=fig, color='black', lw=0.2)
fig.lines.extend([l1])

# Tick label size, and no tick marks on either axis
ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis='both', which='both', length=0)
plt.show()
##Language
# Parse the CSV once; the other two names get independent copies instead of
# re-reading and re-parsing the same file from disk.
data_path = "NetflixOriginals.csv"
netflix_pd = pd.read_csv(data_path)
data = netflix_pd.copy()
df = netflix_pd.copy()
df.head()
| Title | Genre | Premiere | Runtime | IMDB Score | Language | |
|---|---|---|---|---|---|---|
| 0 | Enter the Anime | Documentary | 5-Aug-19 | 58 | 2.5 | English/Japanese |
| 1 | Dark Forces | Thriller | 21-Aug-20 | 81 | 2.6 | Spanish |
| 2 | The App | Science fiction/Drama | 26-Dec-19 | 79 | 2.6 | Italian |
| 3 | The Open House | Horror thriller | 19-Jan-18 | 94 | 3.2 | English |
| 4 | Kaali Khuhi | Mystery | 30-Oct-20 | 90 | 3.4 | Hindi |
# Titles per language as a two-column frame: 'index' (language) and 'total'.
# rename_axis pins the label column to 'index' explicitly, so the lookups below
# do not depend on how the installed pandas names the value_counts index.
common_languages = (
    netflix_pd['Language']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='total')
)

# Languages with at least three titles
language_list = common_languages.loc[common_languages['total'] >= 3, 'index']

# NOTE(review): the list above keeps totals >= 3 while the chart keeps totals > 3 — confirm which is intended
common_languages[common_languages['total'] > 3].plot.bar(x='index', y='total', rot=90)
<AxesSubplot:xlabel='index'>
## Netflix Runtime
# Load the IMDB movie data set and preview the first rows
netflix_pd = pd.read_csv(filepath_or_buffer="IMDB-Movie-Data.csv")
netflix_pd.head()
| Rank | Title | Genre | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Guardians of the Galaxy | Action,Adventure,Sci-Fi | A group of intergalactic criminals are forced ... | James Gunn | Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... | 2014 | 121 | 8.1 | 757074 | 333.13 | 76.0 |
| 1 | 2 | Prometheus | Adventure,Mystery,Sci-Fi | Following clues to the origin of mankind, a te... | Ridley Scott | Noomi Rapace, Logan Marshall-Green, Michael Fa... | 2012 | 124 | 7.0 | 485820 | 126.46 | 65.0 |
| 2 | 3 | Split | Horror,Thriller | Three girls are kidnapped by a man with a diag... | M. Night Shyamalan | James McAvoy, Anya Taylor-Joy, Haley Lu Richar... | 2016 | 117 | 7.3 | 157606 | 138.12 | 62.0 |
| 3 | 4 | Sing | Animation,Comedy,Family | In a city of humanoid animals, a hustling thea... | Christophe Lourdelet | Matthew McConaughey,Reese Witherspoon, Seth Ma... | 2016 | 108 | 7.2 | 60545 | 270.32 | 59.0 |
| 4 | 5 | Suicide Squad | Action,Adventure,Fantasy | A secret government agency recruits some of th... | David Ayer | Will Smith, Jared Leto, Margot Robbie, Viola D... | 2016 | 123 | 6.2 | 393727 | 325.02 | 40.0 |
# Keep the first row for each title (modifies netflix_pd in place)
netflix_pd.drop_duplicates(subset="Title", keep="first", inplace=True)

# Columns needed for the runtime analysis, without rows missing any of them
runtime_columns = ["Title", "Year", "Rating", "Runtime (Minutes)"]
netflix_runtime = netflix_pd[runtime_columns].dropna(how="any")

# Runtime bands; the upper edges sit just below each boundary (e.g. 89.99)
# so a 90-minute title lands in "90-105".
bin_names = ["Less than 60", "60-75", "75-90", "90-105", "105-120", "120-135", "135-150", "More than 150"]
bins = [0, 59.99, 74.99, 89.99, 104.99, 119.99, 134.99, 149.99, 300]

# Label each title with its runtime band (values align on the row index)
netflix_pd["Runtime Group (Minutes)"] = pd.cut(
    x=netflix_runtime["Runtime (Minutes)"],
    bins=bins,
    labels=bin_names,
    include_lowest=True,
)
# Mean rating per runtime band, built with one loop instead of eight
# copy-pasted filters. All original variable names are still defined.
labels = ["Less than 60", "60-75", "75-90", "90-105", "105-120", "120-135", "135-150", "More than 150"]

# Ratings of the titles in each band, in label order
runtime_group = netflix_pd["Runtime Group (Minutes)"]
ratings_by_bin = [netflix_pd.loc[runtime_group == label, "Rating"] for label in labels]
(less_hour_rating, hour_rating, hour_15_rating, hour_30_rating,
 hour_45_rating, two_hour_rating, two_hour_15_rating, over_hour_rating) = ratings_by_bin

# Mean rating of each band (NaN when a band has no titles)
rating_means = [band_ratings.mean() for band_ratings in ratings_by_bin]
(less_hour_rating_mean, hour_rating_mean, hour_15_rating_mean, hour_30_rating_mean,
 hour_45_rating_mean, two_hour_rating_mean, two_hour_15_rating_mean, over_hour_rating_mean) = rating_means

# Bar heights: use the computed mean for every band, including the first one
# (previously hard-coded to 0), and fall back to 0 only when a band is empty.
ratings = [0 if pd.isna(mean_rating) else mean_rating for mean_rating in rating_means]

# graph bar chart
plt.bar(labels, ratings, color="red", alpha=0.5, align="center")
plt.title("Runtime Ratings")
plt.xlabel("Runtime Bins (Minutes)")
plt.ylabel("Ratings")
plt.ylim(0, 10)
plt.xticks(rotation=75)
# Render the figure explicitly rather than relying on the cell's last expression
plt.show()
([0, 1, 2, 3, 4, 5, 6, 7], [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])